MSDA - Bootcamp 2025 Summer
KT Wong
kwanto@hku.hk
Faculty of Social Sciences, HKU
2025-08-07
it starts from the grammar of graphics Wickham (2016)
| Variable | Description |
|---|---|
| manufacturer | Car manufacturer |
| model | Car model |
| year | Year of manufacture |
| displ | Engine displacement (litres) |
| hwy | Miles per gallon (highway) |
| cty | Miles per gallon (city) |
| cyl | Number of cylinders |
| drv | Drive type (f = front, r = rear, 4 = 4wd) |
| class | Type of car |
| trans | Type of transmission |
| fl | Fuel type |
Colour, size, shape and other aesthetic attributes
library(ggthemes)
ggplot(mpg,
aes(displ, hwy)) +
geom_point(aes(color=class)) +
labs(x="Engine size (litres)",
y="Highway fuel economy (miles per gallon)",
title="Relationship between engine size and fuel economy",
color="Car type",
caption="Source: mpg dataset")+
theme_economist()+
scale_color_tableau() +
theme(
axis.title.x = element_text(margin = margin(t = 10)),
axis.title.y = element_text(margin = margin(r = 10))
)# A tibble: 6 × 8
partner year partner_name product product_name US_report_import pop2000
<chr> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 ARE 1998 United Arab Emira… 950341 "Toys repre… 1.06 3.25e6
2 ARE 2000 United Arab Emira… 950349 "Toys repre… 12.0 3.25e6
3 ARE 2003 United Arab Emira… 950349 "Toys repre… 4.65 3.25e6
4 ARE 2005 United Arab Emira… 950320 "Reduced-si… 49.2 3.25e6
5 ARG 1996 Argentina 950341 "Toys repre… 0 3.69e7
6 ARG 1996 Argentina 950310 "Electric t… 10.8 3.69e7
# ℹ 1 more variable: region <dbl>
# A tibble: 5 × 2
partner_name total_import
<chr> <dbl>
1 China 26842305.
2 Denmark 1034990.
3 Canada 572309.
4 Hong Kong, China 545186.
5 Switzerland 400969.
top5_partners=c("China", "Denmark", "Canada", "Hong Kong, China", "Switzerland")
options(scipen = 999)
library(ggthemes)
library(scales)
library(plotly)
p <- toy_imports %>%
filter(partner_name %in% top5_partners) %>%
group_by(year, partner_name) %>%
summarize(total_import=sum(US_report_import)) %>%
ggplot(aes(year, total_import, color=partner_name)) +
geom_line(size=1.18)+
labs(title="Toy imports from the U.S.'s top-5 partners, 1996-2005",
x="Year",
y="Dollar value of imports (log scale)",
color="Import Region")+
scale_x_continuous(breaks=1996:2005)+
scale_y_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(.x))) +
#labels = trans_format("log10", math_format(10^.x))) +
theme_economist()+
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5, margin = margin(b = 15)), # Larger, bold, centered title
axis.title.x = element_text(size = 14, margin = margin(t = 10)), # Larger x-axis label
axis.title.y = element_text(size = 14, margin = margin(r = 15)), # Larger y-axis label with right margin for spacing
axis.text = element_text(size = 12), # Larger tick labels
axis.ticks.y = element_line(color = "black", size = 0.5), # Clearer y-ticks
axis.ticks.length.y = unit(0.3, "cm"), # Slightly longer y-ticks for prominence
legend.title = element_text(size = 12), # Larger legend title
legend.text = element_text(size = 10) # Larger legend text
)
ggplotly(p)library(tidyverse)
rapidcity <- read_csv("https://raw.githubusercontent.com/kwan-MSDA/Bootcamp_2024/main/dataset/rapidcity.csv")
top_5_coldest <- rapidcity %>%
group_by(Year, Month) %>%
summarize(avg_Temp = mean(Temp),
lowest_temp = min(Temp),
highest_temp = max(Temp)) %>%
arrange(avg_Temp) %>%
round(1) %>%
mutate(Month_Year = paste(month.abb[Month], Year, sep="-"), .after = Month) %>%
head(n=5)
top_5_coldest# A tibble: 5 × 6
# Groups: Year [4]
Year Month Month_Year avg_Temp lowest_temp highest_temp
<dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 1996 1 Jan-1996 14.9 -11 46.1
2 2009 12 Dec-2009 16.4 -2.6 35.6
3 2000 12 Dec-2000 17.3 -9 38.8
4 1996 12 Dec-1996 17.5 -10.8 40.4
5 2001 2 Feb-2001 17.6 -3.9 40.8
# Reshape data to long format
top_5_long <- top_5_coldest %>%
pivot_longer(cols = c(avg_Temp, lowest_temp, highest_temp),
names_to = "Temp_Type",
values_to = "Temperature") %>%
mutate(Temp_Type = factor(Temp_Type,
levels = c("lowest_temp", "avg_Temp", "highest_temp"),
labels = c("Lowest", "Average", "Highest")))
# Create the ggplot
p <- ggplot(top_5_long, aes(x = Month_Year, y = Temperature, fill = Temp_Type)) +
geom_bar(stat = "identity", position = position_dodge(width = 1), alpha = 0.6) +
scale_fill_manual(values = c("Lowest" = "purple", "Average" = "blue", "Highest" = "red")) +
labs(title = "Top 5 Coldest Months in Rapid City",
x = "Month-Year",
y = "Temperature (°F)",
fill = "Temperature Type") + # Add legend title
theme_minimal() +
theme(
legend.title = element_text(face = "bold", size = 12, color = "black"), # Style legend title
legend.text = element_text(size = 10), # Style legend text
axis.text.x = element_text(angle = 45, hjust = 1) # Rotate x-axis labels
)
# Convert to interactive plotly plot
ggplotly(p, tooltip = c("x", "y", "fill"))Q: how did survival among adult passengers vary by sex and cabin class?
# A tibble: 6 × 5
name survived sex age passengerClass
<chr> <chr> <chr> <dbl> <chr>
1 Allen, Miss. Elisabeth Walton yes female 29 1st
2 Allison, Master. Hudson Trevor yes male 0.917 1st
3 Allison, Miss. Helen Loraine no female 2 1st
4 Allison, Mr. Hudson Joshua Crei no male 30 1st
5 Allison, Mrs. Hudson J C (Bessi no female 25 1st
6 Anderson, Mr. Harry yes male 48 1st
# A tibble: 6 × 5
# Groups: sex [2]
sex passengerClass total_count survived survival_rate
<chr> <chr> <int> <int> <dbl>
1 female 1st 125 121 0.968
2 female 2nd 85 74 0.871
3 female 3rd 106 47 0.443
4 male 1st 144 47 0.326
5 male 2nd 143 12 0.0839
6 male 3rd 289 45 0.156
library(ggplot2)
library(ggthemes)
library(scales) # For percent formatting
class_labels <- function(passengerClass) {
dplyr::case_when( passengerClass == "1st" ~ "First Class",
passengerClass == "2nd" ~ "Second Class",
passengerClass == "3rd" ~ "Third Class",
TRUE ~ as.character(passengerClass) # Fallback for unexpected values
)
}
ggplot(surv_adults, aes(x = sex, y = survival_rate, fill = sex)) +
geom_col(position = "dodge", width = 0.45, color = "black", alpha = 0.85) + # Add outline and transparency
facet_wrap(~passengerClass, nrow = 1, labeller = labeller(passengerClass = class_labels)) + # Clear facet labels
geom_text(aes(label = scales::percent(survival_rate, accuracy = 1)),
vjust = -0.5, size = 4) + # Add percentage labels above bars
scale_y_continuous(labels = scales::percent_format(accuracy = 1),
limits = c(0, 1), expand = c(0, 0.05)) + # Y-axis as percentage, 0-1 range
scale_fill_brewer(palette = "Set2") + # Colorblind-friendly palette
labs(title = "Survival Rate by Gender and Passenger Class",
y = "Survival Rate",
x = "Gender") +
theme_economist() +
theme( plot.title = element_text(size = 16, face = "bold", hjust = 0.5, margin = margin(b = 15)),
axis.title.x = element_text(size = 12, margin = margin(t = 10)),
axis.title.y = element_text(size = 12, margin = margin(r = 10)),
axis.text.x = element_text(size = 10, margin = margin(t = 5)),
axis.text.y = element_text(size = 10, margin = margin(r = 5)),
strip.text = element_text(size = 11, face = "bold", margin = margin(t = 5, r = 5, b = 10, l = 5)), # Increased bottom margin
panel.spacing = unit(1.5, "lines"), # Increased space between facets
legend.position = "none" )# install.packages('devtools')
#devtools::install_github('bbc/bbplot'))
library(ggpubr)
source("https://raw.githubusercontent.com/kwan-MSDA/R/main/bbc_style.R")
gapminder %>%
group_by(year, continent) %>%
summarize(median_lifeExp = median(lifeExp)) %>%
ggplot(aes(year, median_lifeExp, color=continent)) +
geom_line()+
labs(title="Life expectancy by continent and year",
x="Year",
y="Life expectancy")+
bbc_style()library("ggalt")
library("tidyr")
library(gapminder)
dumbbell_df <- gapminder %>%
filter(year == 1967 | year == 2007) %>%
select(country, year, lifeExp) %>%
spread(year, lifeExp) %>%
mutate(gap = `2007` - `1967`) %>%
arrange(desc(gap)) %>%
head(10)
#Make plot
ggplot(dumbbell_df, aes(x = `1967`, xend = `2007`, y = reorder(country, gap), group = country)) +
geom_dumbbell(colour = "#dddddd",
size = 3,
colour_x = "#FAAB18",
colour_xend = "#1380A1") +
bbc_style() +
labs(title="We're living longer",
subtitle="Biggest life expectancy rise, 1967-2007")library(hrbrthemes)
library(viridis)
gapminder %>%
filter(year==2007) %>%
mutate(country=factor(country, levels=unique(country))) %>%
arrange(desc(pop)) %>%
ggplot(aes(x=gdpPercap, y=lifeExp, size=pop, fill=continent)) +
geom_point(alpha=0.6, shape=21, color="black")+
scale_size(range=c(.1, 24), name="Population (M)")+
scale_fill_viridis(discrete=TRUE, guide=FALSE, option="A")+
theme_ipsum()+
theme(legend.position="none")+
labs(title="Life expectancy by continent in 2007",
x="GDP per capita",
y="Life Expectancy")library(gganimate)
gapminder %>%
ggplot(aes(x=gdpPercap, y=lifeExp, size=pop, fill=continent, frame=year)) +
geom_point(alpha=0.6, shape=21, color="black")+
scale_size(range=c(.1, 22), name="Population (M)")+
scale_fill_viridis(discrete=TRUE, guide=FALSE, option="A")+
theme_ipsum()+
theme(legend.position="none")+
labs(title="Life expectancy by continent in {frame_time}",
x="GDP per capita",
y="Life Expectancy")+
geom_text(data=gapminder %>% filter(pop >1e+8), aes(label=country), size=5, nudge_x=0.1, nudge_y=0.1)+
transition_time(year)+
enter_fade()+
exit_fade()library(plotly)
library(hrbrthemes)
library(viridis)
g<- crosstalk::SharedData$new(gapminder %>%
mutate(country=factor(country, levels=unique(country))) %>%
arrange(desc(pop)),
~ continent)
gg<- g %>%
ggplot(aes(x=gdpPercap, y=lifeExp, fill=continent, frame=year)) +
geom_point(aes(size=pop, alpha=0.6, ids=country))+
scale_size(range=c(.1, 24), name="Population (M)")+
scale_fill_viridis(discrete=TRUE, guide=FALSE, option="A")+
scale_alpha(range=c(0.6, 1), guide=FALSE)+
theme_ipsum()+
# theme(legend.position="none")+
labs(title="Life expectancy by continent between 1952-2007",
x="GDP per capita",
y="Life Expectancy")
ggplotly(gg, height = 500, width = 800)